library(RColorBrewer)
library(mgcv)
library(lme4)
library(mclust)
library(rstan)
library(gtools)
library(dplyr)
library(reshape2)


options(mc.cores = parallel::detectCores())

RUN_STAN=F
source('functions.R')

thresh_highSM = 0.8
thresh_lowSM = 0.2
dat_all=read.csv( file = 'dataset.csv')

writeLines(sprintf('The dataset contains %s patients with measured PfHRP2 and measured platelet counts from %s studies',
                   nrow(dat_all),
                   length(unique(dat_all$study))))
## The dataset contains 2649 patients with measured PfHRP2 and measured platelet counts from 4 studies
African_studies = c('Kilifi (Kenya)',
                    'Kampala (Uganda)',
                    'FEAST (Uganda)')
writeLines('Patients per study:')
## Patients per study:
print(table(dat_all$study))
## 
##       Bangladesh   FEAST (Uganda) Kampala (Uganda)   Kilifi (Kenya) 
##              172              567              492             1418
dat_all$study_col = brewer.pal(n = 8, name = 'Dark2')[c(3,4,5,7)][as.numeric(as.factor(dat_all$study))]
mycols = adjustcolor(brewer.pal(n = 10, 'Paired')[c(2,4,6,10)], 
                     alpha.f = .7)

Some data cleaning

hist(dat_all$hct, breaks = 50)

# get rid of outlier HCT values
ind = which(dat_all$hct <3 | dat_all$hct>50)
dat_all$hct[ind] = NA
table(!is.na(dat_all$hct))
## 
## FALSE  TRUE 
##     6  2643
## PfHRP2 deletions - samples with positive parasite count by negative on the elisa
ind = which(dat_all$hrp2==0 & dat_all$para > 1000)
table(dat_all$study[ind])
## 
##     Bangladesh FEAST (Uganda) Kilifi (Kenya) 
##              1              8             18
writeLines(sprintf('A total of %s samples have zero PfHRP2 but more than 1000 parasites per uL',length(ind)))
## A total of 27 samples have zero PfHRP2 but more than 1000 parasites per uL
dat_all = dat_all[-ind, ]

writeLines(sprintf('After excluding the HRP2 outliers, the dataset contains %s patients with measured PfHRP2 and measured platelet counts',
                   nrow(dat_all)))
## After excluding the HRP2 outliers, the dataset contains 2622 patients with measured PfHRP2 and measured platelet counts

Overview of patient characteristics

Results for Table 1

table(dat_all$study)
## 
##       Bangladesh   FEAST (Uganda) Kampala (Uganda)   Kilifi (Kenya) 
##              171              559              492             1400
xx=aggregate(age ~ study, dat_all, quantile, probs=c(0.25,.5,.75))
colnames(xx$age)=c('lower','median','upper')
xx$age = round(xx$age,1)
print(xx)
##              study age.lower age.median age.upper
## 1       Bangladesh      23.5       30.0      45.0
## 2   FEAST (Uganda)       1.2        2.0       3.3
## 3 Kampala (Uganda)       2.2        3.3       4.6
## 4   Kilifi (Kenya)       1.4        2.4       3.7
aggregate(hrp2 ~ study, dat_all, length)
##              study hrp2
## 1       Bangladesh  171
## 2   FEAST (Uganda)  559
## 3 Kampala (Uganda)  492
## 4   Kilifi (Kenya) 1400
aggregate(outcome ~ study, dat_all, function(x) round(100*mean(x),1))
##              study outcome
## 1       Bangladesh    26.9
## 2   FEAST (Uganda)    11.4
## 3 Kampala (Uganda)     6.7
## 4   Kilifi (Kenya)    11.1
aggregate(platelet ~ study, dat_all, quantile, probs=c(0.25,.5,.75))
##              study platelet.25% platelet.50% platelet.75%
## 1       Bangladesh         27.0         50.0        139.0
## 2   FEAST (Uganda)         74.5        165.0        326.0
## 3 Kampala (Uganda)         49.0         96.0        169.5
## 4   Kilifi (Kenya)         64.0        111.0        215.0
aggregate(hrp2 ~ study, dat_all, quantile, probs=c(0.25,.5,.75))
##              study  hrp2.25%  hrp2.50%  hrp2.75%
## 1       Bangladesh 1082.9050 2667.0400 6127.5550
## 2   FEAST (Uganda)    0.0000  174.7100 1952.6900
## 3 Kampala (Uganda)  588.0000 1838.4000 4097.4000
## 4   Kilifi (Kenya)  418.7393 2206.7408 5071.5299
aggregate(wbc ~ study, dat_all, quantile, probs=c(0.25,.5,.75))
##              study wbc.25% wbc.50% wbc.75%
## 1       Bangladesh   6.900   9.000  11.000
## 2   FEAST (Uganda)   8.400  11.950  18.675
## 3 Kampala (Uganda)   7.500  10.400  15.300
## 4   Kilifi (Kenya)   8.900  12.550  19.000
ind_pos = which(dat_all$malaria_positive==1)
dat_all$log10_parasites = log10(dat_all$para+50)
xx=aggregate(para ~ study, dat_all[ind_pos, ], 
             quantile, probs=c(0.25,.5,.75))
xx$para = round(xx$para)
print(xx)
##              study para.25% para.50% para.75%
## 1       Bangladesh    23550   148874   348540
## 2   FEAST (Uganda)     3640    37600   153680
## 3 Kampala (Uganda)    10635    42530   198540
## 4   Kilifi (Kenya)     6099    69824   316350
table(dat_all$study, dat_all$sickle)
##                   
##                       1    2    3
##   Bangladesh          0    0    0
##   FEAST (Uganda)    466   46   21
##   Kampala (Uganda)  463    4   23
##   Kilifi (Kenya)   1348   41    7
# xx = aggregate(log10_parasites ~ study, dat_all[ind_pos, ], mean)
# xx[,2] = round(10^xx[,2])
# print(xx)

Correlation between the two biomarkers

writeLines('African sites: correlation:')
## African sites: correlation:
ind_Africa = dat_all$study %in% African_studies
res=cor.test(log10(dat_all$platelet[ind_Africa]),
             log10(dat_all$hrp2+1)[ind_Africa])
res
## 
##  Pearson's product-moment correlation
## 
## data:  log10(dat_all$platelet[ind_Africa]) and log10(dat_all$hrp2 + 1)[ind_Africa]
## t = -31.892, df = 2449, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5690929 -0.5131183
## sample estimates:
##        cor 
## -0.5417058
log10(res$p.value)
## [1] -186.2477
round(res$estimate,2)
##   cor 
## -0.54
writeLines('Bangladesh: correlation:')
## Bangladesh: correlation:
res=cor.test(log10(dat_all$platelet[!ind_Africa]),
             log10(dat_all$hrp2+1)[!ind_Africa])
res
## 
##  Pearson's product-moment correlation
## 
## data:  log10(dat_all$platelet[!ind_Africa]) and log10(dat_all$hrp2 + 1)[!ind_Africa]
## t = -4.9404, df = 169, p-value = 1.863e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.4797402 -0.2167256
## sample estimates:
##        cor 
## -0.3552439
log10(res$p.value)
## [1] -5.72988
round(res$estimate,2)
##   cor 
## -0.36

Summary plot of the biomarker data

mycols = rep('black',4)
dat_all$log10_platelet=log10(dat_all$platelet)
ind = which(dat_all$hrp2==0)
dat_all$hrp2[ind] = 10^rnorm(length(ind),0,.1)
dat_all$log10_hrp2 = log10(dat_all$hrp2)
par(las=1, family='serif', mfrow=c(2,2),cex.lab=1.5, cex.axis=1.5)
for(cc in unique(dat_all$study)){
  plot(dat_all$log10_platelet,dat_all$log10_hrp2, 
       panel.first = grid(), xaxt='n', yaxt='n',
       xlab='Platelet count (x1000 per uL)',type='n',
       ylab='PfHRP2 (ng/mL)')
  ind = dat_all$study==cc
  title(paste(cc, ' (n=',sum(ind),')',sep = ''))
  points(dat_all$log10_platelet[ind],
         dat_all$log10_hrp2[ind], pch = 16, 
         col= adjustcolor(dat_all$study_col,.1)[ind])
  points(dat_all$log10_platelet[ind],
         dat_all$log10_hrp2[ind], pch = 1, 
         col= adjustcolor(dat_all$study_col,.5)[ind])
  axis(1, at = log10(c(10, 20,100,1000)),
       labels = c(10, 20,100,1000))
  axis(1, at = log10(seq(20, 90, by =10)), labels = NA)
  axis(1, at = log10(seq(200, 900, by =100)), labels = NA)
  axis(2, at = 0:5, 
       labels = c(1, 10, expression(10^2),expression(10^3), 
                  expression(10^4),expression(10^5)))
}

Fitting a mixture model to platelets and HRP2

Two component mixture - not including FEAST

Analysis not including the FEAST study - only severe malaria studies

# make numeric matrix for stan input
ind_SMstudies = dat_all$study != 'FEAST (Uganda)' &
  !is.na(dat_all$log10_parasites)
X = dat_all[ind_SMstudies,
            c('log10_platelet',
              'log10_hrp2',
              'log10_parasites')]
site_index_SMstudies = as.numeric(as.factor(dat_all$study[ind_SMstudies]))
table(site_index_SMstudies)
## site_index_SMstudies
##    1    2    3 
##  170  484 1398
stan_dat_all = 
  list(N = nrow(X),
       y = X,
       Ntest = ncol(X),
       K_sites = max(site_index_SMstudies),
       site = site_index_SMstudies,
       prior_mu = matrix(c(log10(75), log10(200),
                           log10(250),log10(3000),
                           3,5),
                         nrow = ncol(X), byrow = T),
       prior_sigma_mu = matrix(c(0.1, 0.1,
                                 0.2, 0.2,
                                 0.25, 0.25),
                               nrow = ncol(X), byrow = T),
       beta_prior_prev = matrix(data = c(19,1,
                                         14,7,
                                         14,7),
                                nrow = max(site_index_SMstudies),
                                byrow = T),
       cluster = matrix(data = c(1,2,
                                 2,1,
                                 2,1), 
                        nrow = ncol(X), byrow = T))

print(stan_dat_all$N)
## [1] 2052
# just platelets and HRP2 (reference)
ind_SMstudies2 = dat_all$study != 'FEAST (Uganda)' 
X = dat_all[ind_SMstudies2,c('log10_platelet','log10_hrp2')]

stan_dat_plt_hrp2 = stan_dat_all
stan_dat_plt_hrp2$y = X
stan_dat_plt_hrp2$N = nrow(X)
stan_dat_plt_hrp2$site = as.numeric(as.factor(dat_all$study[ind_SMstudies2]))
stan_dat_plt_hrp2$Ntest = 2
stan_dat_plt_hrp2$prior_mu = stan_dat_all$prior_mu[1:2,]
stan_dat_plt_hrp2$prior_sigma_mu = stan_dat_all$prior_sigma_mu[1:2,]
stan_dat_plt_hrp2$cluster = stan_dat_all$cluster[1:2,]
# compile stan model
lc_mod = stan_model(file = 'Latent_class_continuous_2.stan')
fname_all = paste0('Rout/', 'mod2_LC_all.stanfit')
if(RUN_STAN | !file.exists(fname_all)){
  out_all = sampling(lc_mod, data=stan_dat_all, thin=4)
  save(out_all, file = fname_all)
} else {
  load(fname_all)
}

fname_plt_hrp2 = paste0('Rout/', 'mod2_LC_plt_hrp2.stanfit')
if(RUN_STAN | !file.exists(fname_plt_hrp2)){
  out_plt_hrp2 = sampling(lc_mod, data=stan_dat_plt_hrp2, thin=4)
  save(out_plt_hrp2, file = fname_plt_hrp2)
} else {
  load(fname_plt_hrp2)
}

traceplot(out_all,par=c('prev','mu','sigma'))

traceplot(out_plt_hrp2,par=c('prev','mu','sigma'))

thetas_all = extract(out_all)
thetas_plt_hrp2 = extract(out_plt_hrp2)

round(100*colMeans(thetas_all$prev))
## [1] 98 73 67
round(100*colMeans(thetas_plt_hrp2$prev))
## [1] 94 70 65
mean_vals = round(10^apply(thetas_all$mu, 2:3, mean))
colnames(mean_vals) = c('SM', 'not SM')
rownames(mean_vals) = c('Platelet count','PfHRP2','Parasite count')
print(mean_vals)
##                 
##                    SM not SM
##   Platelet count   73    235
##   PfHRP2          182   3247
##   Parasite count 7584  78667
mean_vals2 = round(10^apply(thetas_plt_hrp2$mu, 2:3, mean))
colnames(mean_vals2) = c('SM', 'not SM')
rownames(mean_vals2) = c('Platelet count','PfHRP2')
print(mean_vals2)
##                 
##                   SM not SM
##   Platelet count  70    231
##   PfHRP2         196   3415
hist(colMeans(thetas_all$ps_SM),breaks = 20)

ind_SM = colMeans(thetas_all$ps_SM)>thresh_highSM
ind_notSM = colMeans(thetas_all$ps_SM)<thresh_lowSM
sum(ind_SM)
## [1] 1344
sum(ind_notSM)
## [1] 463
cor.test(stan_dat_all$y[ind_SM, 1],stan_dat_all$y[ind_SM,2])
## 
##  Pearson's product-moment correlation
## 
## data:  stan_dat_all$y[ind_SM, 1] and stan_dat_all$y[ind_SM, 2]
## t = -2.7007, df = 1342, p-value = 0.007007
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1264960 -0.0201302
## sample estimates:
##         cor 
## -0.07352218
cor.test(stan_dat_all$y[ind_SM, 2],stan_dat_all$y[ind_SM,3])
## 
##  Pearson's product-moment correlation
## 
## data:  stan_dat_all$y[ind_SM, 2] and stan_dat_all$y[ind_SM, 3]
## t = -4.3847, df = 1342, p-value = 1.252e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.17122533 -0.06578923
## sample estimates:
##        cor 
## -0.1188423
cor.test(stan_dat_all$y[ind_notSM, 1],stan_dat_all$y[ind_notSM,2])
## 
##  Pearson's product-moment correlation
## 
## data:  stan_dat_all$y[ind_notSM, 1] and stan_dat_all$y[ind_notSM, 2]
## t = 2.1711, df = 461, p-value = 0.03043
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.009562577 0.189993526
## sample estimates:
##       cor 
## 0.1006052
cor.test(stan_dat_all$y[ind_notSM, 2],stan_dat_all$y[ind_notSM,3])
## 
##  Pearson's product-moment correlation
## 
## data:  stan_dat_all$y[ind_notSM, 2] and stan_dat_all$y[ind_notSM, 3]
## t = 3.1228, df = 461, p-value = 0.001904
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.05349835 0.23201405
## sample estimates:
##       cor 
## 0.1439269
round(10^apply(thetas_plt_hrp2$mu, 2:3, mean))
##       
##        [,1] [,2]
##   [1,]   70  231
##   [2,]  196 3415
round(10^(apply(thetas_plt_hrp2$mu, 2:3, mean)+ 1.96*apply(thetas_plt_hrp2$sigma, 2:3, mean)))
##       
##        [,1]  [,2]
##   [1,]  280   822
##   [2,] 5476 25279
round(10^(apply(thetas_plt_hrp2$mu, 2:3, mean)- 1.96*apply(thetas_plt_hrp2$sigma, 2:3, mean)))
##       
##        [,1] [,2]
##   [1,]   18   65
##   [2,]    7  461
par(mfrow=c(2,2), las=1)
plot(stan_dat_all$y[ind_SM, 1],stan_dat_all$y[ind_SM,2],
     xlab='Platelet count',ylab='HRP2',main='True SM',
     xlim=range(stan_dat_all$y[,1]),ylim=range(stan_dat_all$y[,2]),
     col=dat_all$study_col[ind_SM],panel.first=grid())
legend('bottomright',col = unique(dat_all$study_col), 
       legend = unique(dat_all$study),pch=1,inset=0.03)
plot(stan_dat_all$y[ind_SM, 2],stan_dat_all$y[ind_SM,3],
     xlab='HRP2', xlim=range(stan_dat_all$y[,2]),
     ylim=range(stan_dat_all$y[,3]),
     ylab='Parasite density',main='True SM',
     col=dat_all$study_col[ind_SM],panel.first=grid())
plot(stan_dat_all$y[ind_notSM, 1],stan_dat_all$y[ind_notSM,2],
     xlab='Platelet count',ylab='HRP2',main='Not SM',
     xlim=range(stan_dat_all$y[,1]),ylim=range(stan_dat_all$y[,2]),
     col=dat_all$study_col[ind_notSM],panel.first=grid())
plot(stan_dat_all$y[ind_notSM, 2],stan_dat_all$y[ind_notSM,3],
     xlim=range(stan_dat_all$y[,2]),ylim=range(stan_dat_all$y[,3]),
     xlab='HRP2',ylab='Parasite density',main='Not SM',
     col=dat_all$study_col[ind_notSM],panel.first=grid())

Sensitivity analysis - only African studies

Just use the two severe malaria studies from Africa

# make numeric matrix for stan input
ind_SMAfrica = dat_all$study != 'FEAST (Uganda)' &
  dat_all$study != 'Bangladesh' &
  !is.na(dat_all$log10_parasites)
X = dat_all[ind_SMAfrica,
            c('log10_platelet','log10_hrp2',
              'log10_parasites')]
site_index_SMAfrica = as.numeric(as.factor(dat_all$study[ind_SMAfrica]))
table(site_index_SMAfrica)
## site_index_SMAfrica
##    1    2 
##  484 1398
stan_dat_all_Af = 
  list(N = nrow(X),
       y = X,
       Ntest = ncol(X),
       K_sites = max(site_index_SMAfrica),
       site = site_index_SMAfrica,
       prior_mu = matrix(c(log10(75), log10(200),
                           log10(250),log10(3000),
                           3,5),
                         nrow = ncol(X), byrow = T),
       prior_sigma_mu = matrix(c(0.1, 0.1,
                                 0.1, 0.1,
                                 0.2, 0.2),
                               nrow = ncol(X), byrow = T),
       beta_prior_prev = matrix(data = c(14,7,
                                         14,7),
                                nrow = max(site_index_SMAfrica),
                                byrow = T),
       cluster = matrix(data = c(1,2,
                                 2,1,
                                 2,1), 
                        nrow = ncol(X), byrow = T))

# just platelets and HRP2 (reference)
stan_dat_plt_hrp2_Af = stan_dat_all_Af
stan_dat_plt_hrp2_Af$y = X[,1:2]
stan_dat_plt_hrp2_Af$Ntest = 2
stan_dat_plt_hrp2_Af$prior_mu = stan_dat_all_Af$prior_mu[1:2,]
stan_dat_plt_hrp2_Af$prior_sigma_mu =stan_dat_all_Af$prior_sigma_mu[1:2,]
stan_dat_plt_hrp2_Af$cluster = stan_dat_all_Af$cluster[1:2,]
fname_all_Af = paste0('Rout/', 'mod2_LC_all_Af.stanfit')
if(RUN_STAN | !file.exists(fname_all_Af)){
  out_all_Af = sampling(lc_mod, data=stan_dat_all_Af, thin=4)
  save(out_all_Af, file = fname_all_Af)
} else {
  load(fname_all_Af)
}

fname_plt_hrp2 = paste0('Rout/', 'mod2_LC_plt_hrp2_Af.stanfit')
if(RUN_STAN | !file.exists(fname_plt_hrp2)){
  out_plt_hrp2_Af = sampling(lc_mod, data=stan_dat_plt_hrp2_Af, thin=4)
  save(out_plt_hrp2_Af, file = fname_plt_hrp2)
} else {
  load(fname_plt_hrp2)
}

traceplot(out_all_Af)
## 'pars' not specified. Showing first 10 parameters by default.

traceplot(out_plt_hrp2_Af)
## 'pars' not specified. Showing first 10 parameters by default.

thetas_all_Af = extract(out_all_Af)
thetas_plt_hrp2_Af = extract(out_plt_hrp2_Af)

round(100*colMeans(thetas_all_Af$prev))
## [1] 71 66
round(100*colMeans(thetas_plt_hrp2_Af$prev))
## [1] 69 64
mean_vals = round(10^apply(thetas_all_Af$mu, 2:3, mean))
colnames(mean_vals) = c('SM', 'not SM')
rownames(mean_vals) = c('Platelet count','PfHRP2','Parasite count')
print(mean_vals)
##                 
##                    SM not SM
##   Platelet count   75    231
##   PfHRP2          193   3398
##   Parasite count 7617  80408
mean_vals2 = round(10^apply(thetas_plt_hrp2_Af$mu, 2:3, mean))
colnames(mean_vals2) = c('SM', 'not SM')
rownames(mean_vals2) = c('Platelet count','PfHRP2')
print(mean_vals2)
##                 
##                   SM not SM
##   Platelet count  73    228
##   PfHRP2         208   3556

ROC curves

par(mfcol=c(2,2),las=1,cex.lab=1.3,cex.axis=1.3,family='serif')
roc_analysis(thetas = thetas_all, stan_dat = stan_dat_all,
             xvals = matrix(c(log10(10),log10(1000),
                              1,5,1,6.5),
                            nrow = 3,byrow = T),
             xnames = c('Platelet count (x1000 per uL)',
                        'PfHRP2 (ng/mL)',
                        'Parasite count (per uL)'),
             cutofftype = c('upper','lower','lower'),
             mycuts = log10(c(150, 1000, 10000)))
## [1] "Platelet count (x1000 per uL)"
## [1] 84
## [1] 75
## [1] "PfHRP2 (ng/mL)"
## [1] 87
## [1] 84
## [1] "Parasite count (per uL)"
## [1] 84
## [1] 55
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "log" is not a graphical
## parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "log" is not a graphical
## parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "log" is not a graphical
## parameter

# sensitivity analysis using African studies only
writeLines('Model using only African studies')
## Model using only African studies
roc_analysis(thetas = thetas_all_Af,
             stan_dat = stan_dat_all_Af,
             xvals = matrix(c(log10(10),log10(300),
                              1,5,1,6.3),
                            nrow = 3,byrow = T),
             xnames = c('Platelet count (x1000 per uL)',
                        'PfHRP2 (ng/mL)',
                        'Parasite count (per uL)'),
             cutofftype = c('upper','lower','lower'),
             mycuts = log10(c(150, 1000, 10000)))
## [1] "Platelet count (x1000 per uL)"
## [1] 85
## [1] 74
## [1] "PfHRP2 (ng/mL)"
## [1] 89
## [1] 83
## [1] "Parasite count (per uL)"
## [1] 85
## [1] 55
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "log" is not a graphical
## parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "log" is not a graphical
## parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "log" is not a graphical
## parameter

writeLines('Model using only platelets and HRP2')
## Model using only platelets and HRP2
par(mfcol=c(2,2),las=1,cex.lab=1.3,cex.axis=1.3,family='serif')
roc_analysis(thetas = thetas_plt_hrp2, stan_dat = stan_dat_plt_hrp2,
             xvals = matrix(c(log10(10),log10(1000),1,5),
                            nrow = 2,byrow = T),
             xnames = c('Platelet count (x1000 per uL)',
                        'PfHRP2 (ng/mL)'),
             cutofftype = c('upper','lower'))
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "log" is not a graphical
## parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "log" is not a graphical
## parameter

Three component mixture model

lc3_mod = stan_model(file = 'Latent_class_continuous_3.stan')

This does not fit when we add parasite counts. So we are only using two biomarkers

# make numeric matrix for stan input
X = dat_all[, c('log10_platelet','log10_hrp2')]
site_index = as.numeric(as.factor(dat_all$study))
table(site_index, dat_all$study)
##           
## site_index Bangladesh FEAST (Uganda) Kampala (Uganda) Kilifi (Kenya)
##          1        171              0                0              0
##          2          0            559                0              0
##          3          0              0              492              0
##          4          0              0                0           1400
stan_dat = 
  list(N = nrow(X),
       y = X,
       Ntest = ncol(X),
       K_sites = max(site_index),
       site = site_index,
       prior_mu = matrix(c(log10(75), log10(200),log10(300),
                           log10(2),log10(250),log10(3000)),
                         nrow = ncol(X),ncol = 3, byrow = T),
       prior_sigma_mu = matrix(c(0.1, 0.1, 0.1,
                                 0.1, 0.1, 0.1),
                               nrow = ncol(X), ncol = 3, byrow = T),
       alpha_prior = matrix(data = c(19, 1, .1,
                                     3,  3, 3,
                                     14, 7, 1,
                                     14, 7, 1),
                            nrow = max(site_index),
                            byrow = T),
       cluster = matrix(data = c(1,2,3,
                                 3,2,1), 
                        nrow = ncol(X), ncol = 3, byrow = T))

fname_final = paste0('Rout/', 'mod3_LC_final.stanfit')
if(RUN_STAN | !file.exists(fname_final)){
  out_final = sampling(lc3_mod, data = stan_dat, iter=2000,thin=4)
  save(out_final, file = fname_final)
} else {
  load(fname_final)
}

traceplot(out_final, par=c('theta'))

thetas_final = extract(out_final)

writeLines('Estimated prevalences across the 4 studies:\n')
## Estimated prevalences across the 4 studies:
round(100*apply(thetas_final$theta[, ,1], 2, 
                quantile, probs=c(.025, 0.5, 0.975)),1)
##       [,1] [,2] [,3] [,4]
## 2.5%  87.6 30.4 64.2 61.1
## 50%   93.8 35.1 69.7 64.7
## 97.5% 97.9 39.9 74.6 68.1
c_mean=round(10^apply(thetas_final$mu, 2:3, mean))

c_upper=round(10^(apply(thetas_final$mu, 2:3, mean)+ 1.96*apply(thetas_final$sigma, 2:3, mean)))

c_lower=round(10^(apply(thetas_final$mu, 2:3, mean)- 1.96*apply(thetas_final$sigma, 2:3, mean)))

writeLines(sprintf('In SM the 95%% pred interval for the platelet count is %s to %s with a mean of %s',c_lower[1,1],c_upper[1,1],c_mean[1,1] ))
## In SM the 95% pred interval for the platelet count is 21 to 235 with a mean of 70
writeLines(sprintf('In SM the 95%% pred interval for the PfHRP2 concentration is %s to %s with a mean of %s',c_lower[2,3],c_upper[2,3],c_mean[2,3] ))
## In SM the 95% pred interval for the PfHRP2 concentration is 570 to 19718 with a mean of 3352
ind_not_FEAST = stan_dat$site != 2
plot(colMeans(thetas_final$ps_SM)[ind_not_FEAST],
     colMeans(thetas_plt_hrp2$ps_SM))
lines(0:1, 0:1)

Check conditional independence

ps = apply(thetas_final$ps_comp, 2:3, mean)
ind_SM = ps[,1]>0.5; sum(ind_SM)
## [1] 1635
ind_1 = ps[,2]>0.5; sum(ind_1)
## [1] 793
ind_2 = ps[,3]>0.5; sum(ind_2)
## [1] 194
par(mfrow=c(2,2), las=1)
plot(stan_dat$y[ind_SM, 1],stan_dat$y[ind_SM,2],
     xlab='Platelet count (x1000 per uL)',
     ylab='PfHRP2 (ng/mL)',main='Severe malaria',
     xlim=range(stan_dat$y[,1]),ylim=range(stan_dat$y[,2]),
     col = dat_all$study_col[ind_SM],panel.first=grid())
cor.test(stan_dat$y[ind_SM, 1],stan_dat$y[ind_SM,2])
## 
##  Pearson's product-moment correlation
## 
## data:  stan_dat$y[ind_SM, 1] and stan_dat$y[ind_SM, 2]
## t = -2.0615, df = 1633, p-value = 0.03941
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.09918144 -0.00247591
## sample estimates:
##        cor 
## -0.0509481
legend('bottomright',col = unique(dat_all$study_col), 
       legend = unique(dat_all$study),pch=1,inset=0.03)

plot(stan_dat$y[ind_1, 1],stan_dat$y[ind_1,2],
     xlab='Platelet count (x1000 per uL)',
     ylab='PfHRP2 (ng/mL)',main='Not severe malaria (component 1)',
     xlim=range(stan_dat$y[,1]),ylim=range(stan_dat$y[,2]),
     col = dat_all$study_col[ind_1],panel.first=grid())
cor.test(stan_dat$y[ind_1, 1],stan_dat$y[ind_1,2])
## 
##  Pearson's product-moment correlation
## 
## data:  stan_dat$y[ind_1, 1] and stan_dat$y[ind_1, 2]
## t = 1.2502, df = 791, p-value = 0.2116
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.02528914  0.11367676
## sample estimates:
##        cor 
## 0.04440863
plot(stan_dat$y[ind_2, 1],stan_dat$y[ind_2,2],
     xlab='Platelet count (x1000 per uL)',
     ylab='PfHRP2 (ng/mL)',main='Not severe malaria (component 2)',
     xlim=range(stan_dat$y[,1]),ylim=range(stan_dat$y[,2]),
     col = dat_all$study_col[ind_2],panel.first=grid())
cor.test(stan_dat$y[ind_2, 1],stan_dat$y[ind_2,2])
## 
##  Pearson's product-moment correlation
## 
## data:  stan_dat$y[ind_2, 1] and stan_dat$y[ind_2, 2]
## t = -0.68593, df = 192, p-value = 0.4936
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.18900064  0.09207392
## sample estimates:
##         cor 
## -0.04944223

ROC curves

par(mfcol=c(2,2),las=1,cex.lab=1.3,cex.axis=1.3,family='serif')
roc_analysis(thetas = thetas_final, stan_dat = stan_dat,
             xvals = matrix(c(log10(10),log10(1000),1,5),
                            nrow = 2,byrow = T),
             xnames = c('Platelet count (x1000 per uL)',
                        'PfHRP2 (ng/mL)'),
             cutofftype = c('upper','lower'))
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "log" is not a graphical
## parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "log" is not a graphical
## parameter

Make a P(SM) variable for subsequent analysis (continuous) and a discrete classification based on a cutoff of 0.5

dat_all$P_SM = colMeans(thetas_final$ps_SM)
dat_all$SM = as.numeric(dat_all$P_SM>0.5)
hist(dat_all$P_SM, breaks = 20,main='Final model', xlab='P(SM)')

Subsequent analysis

Relationship with the two main subphenotypes, cerebral malaria and severe malaria anaemia

writeLines('False diagnosis rate by coma status and by study:')
## False diagnosis rate by coma status and by study:
print(aggregate(SM ~ coma + study, data = dat_all, mean))
##   coma            study        SM
## 1    0       Bangladesh 0.9841270
## 2    1       Bangladesh 0.9629630
## 3    0   FEAST (Uganda) 0.3333333
## 4    1   FEAST (Uganda) 0.4311927
## 5    0 Kampala (Uganda) 0.5131579
## 6    1 Kampala (Uganda) 0.8636364
## 7    0   Kilifi (Kenya) 0.7113402
## 8    1   Kilifi (Kenya) 0.6212121
writeLines('True diagnosis rate by severe anaemia status (<15% haematocrit) and by study:')
## True diagnosis rate by severe anaemia status (<15% haematocrit) and by study:
dat_all$SMA = as.numeric(dat_all$hct<=15)
print(aggregate(SM ~ SMA + study, data = dat_all,mean))
##   SMA            study        SM
## 1   0       Bangladesh 0.9691358
## 2   1       Bangladesh 1.0000000
## 3   0   FEAST (Uganda) 0.3357488
## 4   1   FEAST (Uganda) 0.4125874
## 5   0 Kampala (Uganda) 0.8516949
## 6   1 Kampala (Uganda) 0.5625000
## 7   0   Kilifi (Kenya) 0.6199461
## 8   1   Kilifi (Kenya) 0.8280702
par(las = 1, family='serif',mfrow=c(2,2), cex.lab=1.3,
    cex.axis=1.3)
n_levels = 11
f=colorRampPalette(colors = RColorBrewer::brewer.pal(name = 'RdYlBu',n = n_levels))
mycuts = cut(x = dat_all$P_SM, breaks = seq(0,1,by=1/n_levels))
cols_ind = as.numeric(mycuts)
mycols = f(n_levels)

for(cc in unique(dat_all$study)){
  ind = dat_all$study==cc
  plot(X, type='n',
       panel.first=grid(), 
       xlab='Platelet count (x1000 per uL)', 
       ylab='PfHRP2 (ng/mL)', xaxt='n', yaxt='n')
  points(X[ind, ],  pch=1,
         col= adjustcolor(mycols[cols_ind],.7)[ind])
  points(X[ind, ],  pch=16,
         col= adjustcolor(mycols[cols_ind],.4)[ind])
  if(cc %in% African_studies){
    ind_AS = dat_all$sickle==2
    points(X[ind&ind_AS, ],pch=2)
    ind_SS = dat_all$sickle==3
    points(X[ind&ind_SS, ],pch=3)
  }
  title(cc)
  axis(1, at = log10(c(10, 20,100,200,1000)),
       labels = c(10, 20,100,200,1000))
  axis(1, at = log10(seq(20, 100, by =10)), 
       labels = NA)
  axis(1, at = log10(seq(200, 1000, by =100)), 
       labels = NA)
  axis(2, at = 0:5, 
       labels = c(1, 10, 
                  expression(10^2),
                  expression(10^3), 
                  expression(10^4),
                  expression(10^5)))
  
}
table(sickle=dat_all$sickle, 
      class=dat_all$SM, 
      study=dat_all$study)
## , , study = Bangladesh
## 
##       class
## sickle   0   1
##      1   0   0
##      2   0   0
##      3   0   0
## 
## , , study = FEAST (Uganda)
## 
##       class
## sickle   0   1
##      1 284 182
##      2  40   6
##      3  20   1
## 
## , , study = Kampala (Uganda)
## 
##       class
## sickle   0   1
##      1 126 337
##      2   2   2
##      3  19   4
## 
## , , study = Kilifi (Kenya)
## 
##       class
## sickle   0   1
##      1 437 911
##      2  27  14
##      3   6   1
lgd_ = rep(NA, n_levels)
lgd_[c(1,6,11)] = c(1,0.5,0)
legend('bottomleft',
       legend = lgd_,
       fill = rev(mycols),bty='n',x.intersp =.5,
       border = NA,inset = 0.01,
       y.intersp = 0.5,title = '',
       cex = 1, text.font = 1)

legend('bottomright',pch=2:3, legend = c('HbAS','HbSS'),inset=0.03)

### Parasite counts
ind_neg = dat_all$malaria_positive
par(las = 1, family='serif',mfrow=c(2,2), cex.lab=1.3, cex.axis=1.3)
for(cc in unique(dat_all$study)){
  ind = dat_all$study==cc
  set.seed(7475)
  myxs=dat_all$P_SM[ind]
  myys=log10(dat_all$para+10^1.7)[ind]
  plot(myxs,myys, panel.first=grid(), ylim =c(1.5,6.5),
       pch=1, xlim=c(0,1),
       col = adjustcolor('black',.4),
       xlab='Prob(Severe Malaria)', 
       ylab='Parasite density per uL',yaxt='n')
  points(myxs,myys,pch=16,col = adjustcolor('black',.1))
  # axis(1, at=1:3,labels = c('A','B','C'), tick = F)
  axis(2, at = 2:6, 
       labels = c(expression(10^2),
                  expression(10^3), 
                  expression(10^4),
                  expression(10^5),
                  expression(10^6)))
  mod=gam(myys ~ s(myxs,k=3),
          data=data.frame(myys=myys,myxs=myxs))
  pxs=seq(0,1,length.out = 100)
  lines(pxs,predict(mod, data.frame(myxs=pxs)),lwd=3)
  title(cc)
}

Haematocrit

par(las = 1, family='serif',mfrow=c(2,2), cex.lab=1.3, cex.axis=1.3)
for(cc in unique(dat_all$study)){
  ind = dat_all$study==cc
  set.seed(7475)
  myxs=(dat_all$P_SM)[ind]
  myys=jitter(dat_all$hct[ind], amount = .1)
  plot(myxs,myys,panel.first=grid(), 
       ylim = range(dat_all$hct,na.rm = T),
       pch=1, xlim=c(0,1),
       col = adjustcolor('black',.4),
       xlab='Prob(Severe Malaria)',
       ylab='Haematocrit (%)')
  points(myxs,myys,pch=16,col = adjustcolor('black',.1))
  mod=gam(myys ~ s(myxs,k=3),
          data=data.frame(myys=myys,myxs=myxs))
  pxs=seq(0,1,length.out = 100)
  lines(pxs,predict(mod, data.frame(myxs=pxs)),lwd=3)
  title(cc)
}

hct_subgroups = aggregate(hct ~ SM + study, 
                          dat_all, median)
print(hct_subgroups)
##   SM            study  hct
## 1  0       Bangladesh 35.0
## 2  1       Bangladesh 27.3
## 3  0   FEAST (Uganda) 30.0
## 4  1   FEAST (Uganda) 19.2
## 5  0 Kampala (Uganda) 12.2
## 6  1 Kampala (Uganda) 15.8
## 7  0   Kilifi (Kenya) 27.5
## 8  1   Kilifi (Kenya) 19.4
# Severe anaemia
SA_subgroups = aggregate(hct ~ SM + study, 
                         dat_all, function(x) round(100*mean(x<15),1))
print(SA_subgroups)
##   SM            study  hct
## 1  0       Bangladesh  0.0
## 2  1       Bangladesh  4.2
## 3  0   FEAST (Uganda) 21.7
## 4  1   FEAST (Uganda) 27.8
## 5  0 Kampala (Uganda) 76.2
## 6  1 Kampala (Uganda) 41.7
## 7  0   Kilifi (Kenya) 10.2
## 8  1   Kilifi (Kenya) 24.6
ind_SM = dat_all$P_SM > thresh_highSM
ind_notSM = dat_all$P_SM < thresh_lowSM
par(mfcol=c(2,3))
for(cc in c("FEAST (Uganda)","Kampala (Uganda)", "Kilifi (Kenya)")){
  ind = dat_all$study==cc
  
  hist(dat_all$hct[ind_SM&ind], breaks = seq(0,50, by=1),
       xlab='Haematocrit (%)', ylab='Number of patients',
       main = paste0(cc,': ', 'P(SM)>0.8'),
       panel.first=grid())
  abline(v=median(dat_all$hct[ind_SM&ind],na.rm=T),lwd=2, col='red')
  print(median(dat_all$hct[ind_SM&ind],na.rm=T))
  hist(dat_all$hct[ind_notSM&ind], breaks = seq(0,50, by=1),
       xlab='Haematocrit (%)', ylab='Number of patients',
       main = paste0(cc,': ', 'P(SM)<0.2'),
       panel.first=grid())
  abline(v=median(dat_all$hct[ind_notSM&ind],na.rm=T),lwd=2, col='red')
  print(median(dat_all$hct[ind_notSM&ind],na.rm=T))
}
## [1] 19.2
## [1] 31
## [1] 17
## [1] 11.6
## [1] 19.4

## [1] 28.35

P(Severe malaria) and mortality

par(las = 1, family='serif',mfrow=c(2,2), cex.lab=1.3, cex.axis=1.3)
for(cc in unique(dat_all$study)){
  ind = dat_all$study==cc
  set.seed(7475)
  xs = seq(0,1,length.out = 100)
  mod=glm(outcome ~ P_SM, family='binomial', data = dat_all[ind, ])
  print(summary(mod))
  preds=predict(mod,newdata = data.frame(P_SM=xs),se.fit = T)
  plot(xs,100*inv.logit(preds$fit),panel.first=grid(), 
       type='l',lwd=2,  col = adjustcolor('black',.4),
       xlab='Prob(Severe Malaria)',
       ylim = range(100*inv.logit(c(preds$fit+1.96*preds$se.fit,
                                    rev(preds$fit-1.96*preds$se.fit)))),
       ylab='Probability of death (%)')
  polygon(x = c(xs, rev(xs)), 
          y = 100*inv.logit(c(preds$fit+1.96*preds$se.fit,
                              rev(preds$fit-1.96*preds$se.fit))),
          col=adjustcolor('grey', .3), border = NA)
  title(cc)
}
## 
## Call:
## glm(formula = outcome ~ P_SM, family = "binomial", data = dat_all[ind, 
##     ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5023  -0.5023  -0.4983  -0.4766   2.1132  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.0067     0.1693 -11.854   <2e-16 ***
## P_SM         -0.1129     0.3111  -0.363    0.717    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 397.79  on 558  degrees of freedom
## Residual deviance: 397.65  on 557  degrees of freedom
## AIC: 401.65
## 
## Number of Fisher Scoring iterations: 4
## 
## Call:
## glm(formula = outcome ~ P_SM, family = "binomial", data = dat_all[ind, 
##     ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4602  -0.4568  -0.4184  -0.1625   3.0445  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -4.6411     0.7967  -5.826 5.69e-09 ***
## P_SM          2.4517     0.8583   2.856  0.00428 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 242.07  on 491  degrees of freedom
## Residual deviance: 228.40  on 490  degrees of freedom
## AIC: 232.4
## 
## Number of Fisher Scoring iterations: 7
## 
## Call:
## glm(formula = outcome ~ P_SM, family = "binomial", data = dat_all[ind, 
##     ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5560  -0.5223  -0.4482  -0.4442   2.1756  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.7887     0.1435 -12.465   <2e-16 ***
## P_SM         -0.4800     0.1985  -2.418   0.0156 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 974.42  on 1399  degrees of freedom
## Residual deviance: 968.68  on 1398  degrees of freedom
## AIC: 972.68
## 
## Number of Fisher Scoring iterations: 4
## 
## Call:
## glm(formula = outcome ~ P_SM, family = "binomial", data = dat_all[ind, 
##     ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8060  -0.8059  -0.8013   1.6016   1.7483  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  -1.6335     1.1108  -1.471    0.141
## P_SM          0.6758     1.1645   0.580    0.562
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 199.14  on 170  degrees of freedom
## Residual deviance: 198.77  on 169  degrees of freedom
## AIC: 202.77
## 
## Number of Fisher Scoring iterations: 4

Summary of lab values by inferred subgroup

age_subgroups = aggregate(age ~ SM + study, 
                          dat_all, function(x) round(median(x),1))
print(age_subgroups)
##   SM            study  age
## 1  0       Bangladesh 26.0
## 2  1       Bangladesh 30.0
## 3  0   FEAST (Uganda)  1.8
## 4  1   FEAST (Uganda)  2.4
## 5  0 Kampala (Uganda)  3.3
## 6  1 Kampala (Uganda)  3.3
## 7  0   Kilifi (Kenya)  2.3
## 8  1   Kilifi (Kenya)  2.4
para_subgroups = aggregate(log10_parasites ~ SM + study, 
                           dat_all, function(x) round(mean(x),1))
para_subgroups$log10_parasites=round(10^para_subgroups$log10_parasites)
print(para_subgroups)
##   SM            study log10_parasites
## 1  0       Bangladesh          100000
## 2  1       Bangladesh           79433
## 3  0   FEAST (Uganda)             316
## 4  1   FEAST (Uganda)           39811
## 5  0 Kampala (Uganda)           12589
## 6  1 Kampala (Uganda)           50119
## 7  0   Kilifi (Kenya)           12589
## 8  1   Kilifi (Kenya)           79433
dat_all$log10_wbc = log10(dat_all$wbc)
wbc_subgroups = aggregate(log10_wbc ~ SM + study, 
                          dat_all, function(x) round(mean(x),1))
wbc_subgroups$log10_wbc=round(10^wbc_subgroups$log10_wbc)
print(wbc_subgroups)
##   SM            study log10_wbc
## 1  0       Bangladesh         8
## 2  1       Bangladesh         8
## 3  0   FEAST (Uganda)        13
## 4  1   FEAST (Uganda)        10
## 5  0 Kampala (Uganda)        13
## 6  1 Kampala (Uganda)        10
## 7  0   Kilifi (Kenya)        16
## 8  1   Kilifi (Kenya)        13
hbb_subgroups = aggregate(sickle ~ SM + study, 
                          dat_all, table)
print(hbb_subgroups)
##   SM            study sickle.1 sickle.2 sickle.3
## 1  0   FEAST (Uganda)      284       40       20
## 2  1   FEAST (Uganda)      182        6        1
## 3  0 Kampala (Uganda)      126        2       19
## 4  1 Kampala (Uganda)      337        2        4
## 5  0   Kilifi (Kenya)      437       27        6
## 6  1   Kilifi (Kenya)      911       14        1
death_subgroups = aggregate(outcome ~ SM + study, 
                            dat_all, function(x) round(100*mean(x),1))
print(death_subgroups)
##   SM            study outcome
## 1  0       Bangladesh     0.0
## 2  1       Bangladesh    27.7
## 3  0   FEAST (Uganda)    11.6
## 4  1   FEAST (Uganda)    11.1
## 5  0 Kampala (Uganda)     1.4
## 6  1 Kampala (Uganda)     9.0
## 7  0   Kilifi (Kenya)    13.9
## 8  1   Kilifi (Kenya)     9.6
par(las = 1, family='serif',mfrow=c(2,2), cex.lab=1.3, cex.axis=1.3)
for(cc in unique(dat_all$study)){
  ind = dat_all$study==cc
  set.seed(7475)
  myxs=(dat_all$P_SM)[ind]
  myys=log10(dat_all$wbc[ind])
  plot(myxs,myys,panel.first=grid(), 
       ylim = range(log10(dat_all$wbc),na.rm = T),
       pch=1, xlim=c(0,1), yaxt='n',
       col = adjustcolor('black',.4),
       xlab='Prob(Severe Malaria)',
       ylab='White blood cell count (x1000 per uL)')
  axis(2, at = log10(c(1,3,10,30,100)), labels = c(1,3,10,30,100))
  points(myxs,myys,pch=16,col = adjustcolor('black',.1))
  mod=gam(myys ~ s(myxs,k=3),
          data=data.frame(myys=myys,myxs=myxs))
  pxs=seq(0,1,length.out = 100)
  lines(pxs,predict(mod, data.frame(myxs=pxs)),lwd=3)
  title(cc)
}

mod_wbc = lm(log10_wbc ~ study + SM + age, data = dat_all)
xx=summary(mod_wbc)
xx$coefficients
##                           Estimate  Std. Error    t value      Pr(>|t|)
## (Intercept)            1.287494636 0.045524542  28.281331 9.516183e-154
## studyFEAST (Uganda)   -0.124753261 0.043503756  -2.867643  4.168538e-03
## studyKampala (Uganda) -0.142630933 0.042142174  -3.384518  7.235529e-04
## studyKilifi (Kenya)   -0.069474443 0.042020916  -1.653330  9.838379e-02
## SM                    -0.121163979 0.010854018 -11.163053  2.669766e-28
## age                   -0.006631417 0.001184684  -5.597624  2.399319e-08
10^(-xx$coefficients['SM',1])
## [1] 1.321795
mod_wbc2 = gam(as.numeric(wbc>15) ~ study + SM + s(age,k=3),
               family='binomial',
               data = dat_all)
xx=summary(mod_wbc2)
xx$p.pv
##           (Intercept)   studyFEAST (Uganda) studyKampala (Uganda) 
##          3.872369e-05          4.663045e-07          1.014519e-07 
##   studyKilifi (Kenya)                    SM 
##          9.244550e-06          2.111265e-17
round(exp(xx$coefficients['SM',1] + c(-1,0,1)*xx$coefficients['SM',2]),2)
## numeric(0)
table(dat_all$study, dat_all$wbc > 15)
##                   
##                    FALSE TRUE
##   Bangladesh         149   22
##   FEAST (Uganda)     354  204
##   Kampala (Uganda)   365  126
##   Kilifi (Kenya)     870  530

Bacteraemia

Relationship between the blood culture data and the probability of severe malaria. We also look at whether WBC values predict bacteraemia

mod=glm(BS ~ SM, family='binomial',data = dat_all)
table(dat_all$SM, dat_all$BS)
##    
##       0   1
##   0 445  29
##   1 904  22
summary(mod)
## 
## Call:
## glm(formula = BS ~ SM, family = "binomial", data = dat_all)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3553  -0.3553  -0.2193  -0.2193   2.7349  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.7308     0.1917 -14.249  < 2e-16 ***
## SM           -0.9850     0.2886  -3.413 0.000642 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 437.98  on 1399  degrees of freedom
## Residual deviance: 426.26  on 1398  degrees of freedom
##   (1222 observations deleted due to missingness)
## AIC: 430.26
## 
## Number of Fisher Scoring iterations: 6
xx=summary(mod)$coefficients
exp(xx[2,1] + c(-1,0,1)*1.96* xx[2,2])
## [1] 0.2121076 0.3734361 0.6574707
summary(glm(BS ~ log10_wbc, family='binomial', data = dat_all))
## 
## Call:
## glm(formula = BS ~ log10_wbc, family = "binomial", data = dat_all)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.2857  -0.2745  -0.2719  -0.2697   2.6151  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.3902     0.6290  -5.390 7.06e-08 ***
## log10_wbc     0.1024     0.5442   0.188    0.851    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 437.98  on 1399  degrees of freedom
## Residual deviance: 437.95  on 1398  degrees of freedom
##   (1222 observations deleted due to missingness)
## AIC: 441.95
## 
## Number of Fisher Scoring iterations: 6
summary(glm(BS ~ log10_wbc, family='binomial', 
            data = dat_all[dat_all$P_SM>0.8,]))
## 
## Call:
## glm(formula = BS ~ log10_wbc, family = "binomial", data = dat_all[dat_all$P_SM > 
##     0.8, ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.2379  -0.1912  -0.1789  -0.1654   2.9721  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -3.1534     1.2629  -2.497   0.0125 *
## log10_wbc    -0.9213     1.2119  -0.760   0.4471  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 133.13  on 806  degrees of freedom
## Residual deviance: 132.53  on 805  degrees of freedom
##   (630 observations deleted due to missingness)
## AIC: 136.53
## 
## Number of Fisher Scoring iterations: 7
summary(glm(BS ~ log10_wbc, family='binomial', 
            data = dat_all[dat_all$P_SM<0.2,]))
## 
## Call:
## glm(formula = BS ~ log10_wbc, family = "binomial", data = dat_all[dat_all$P_SM < 
##     0.2, ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4363  -0.3724  -0.3506  -0.3242   2.5366  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -1.9199     0.9723  -1.975   0.0483 *
## log10_wbc    -0.6903     0.8159  -0.846   0.3975  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 173.34  on 377  degrees of freedom
## Residual deviance: 172.62  on 376  degrees of freedom
##   (435 observations deleted due to missingness)
## AIC: 176.62
## 
## Number of Fisher Scoring iterations: 5

Validation using sickle trait

ind = dat_all$sickle%in% 1:2
mod=glm(SM ~ sickle + study, family='binomial',
        data.frame(SM=dat_all$SM[ind],
                   sickle=as.numeric(dat_all$sickle==2)[ind],
                   study = as.factor(dat_all$study)[ind]))
xx=summary(mod)$coefficients
xx
##                         Estimate Std. Error   z value     Pr(>|z|)
## (Intercept)           -0.4480304 0.09350842 -4.791338 1.656729e-06
## sickle                -1.3855045 0.25720068 -5.386862 7.169854e-08
## studyKampala (Uganda)  1.4361129 0.13963506 10.284759 8.254826e-25
## studyKilifi (Kenya)    1.1824618 0.10908370 10.839950 2.225938e-27
round(exp(xx['sickle','Estimate']+(c(-1,0,1)*1.96*xx['sickle','Std. Error'])),2)
## [1] 0.15 0.25 0.41
for(ss in African_studies){
  ind = dat_all$study==ss & dat_all$sickle%in% 1:2
  mod=glm(SM ~ sickle, family='binomial',
          data.frame(SM=dat_all$SM[ind],
                     sickle=as.numeric(dat_all$sickle==2)[ind]))
  xx=summary(mod)$coefficients
  print(ss)
  print(round(exp(xx['sickle','Estimate']+(c(-1,0,1)*1.96*xx['sickle','Std. Error'])),2))
}
## [1] "Kilifi (Kenya)"
## [1] 0.13 0.25 0.48
## [1] "Kampala (Uganda)"
## [1] 0.05 0.37 2.68
## [1] "FEAST (Uganda)"
## [1] 0.10 0.23 0.56
ind = dat_all$sickle%in% c(1,3)
table(dat_all$SM, dat_all$sickle, dat_all$study)
## , ,  = Bangladesh
## 
##    
##       1   2   3
##   0   0   0   0
##   1   0   0   0
## 
## , ,  = FEAST (Uganda)
## 
##    
##       1   2   3
##   0 284  40  20
##   1 182   6   1
## 
## , ,  = Kampala (Uganda)
## 
##    
##       1   2   3
##   0 126   2  19
##   1 337   2   4
## 
## , ,  = Kilifi (Kenya)
## 
##    
##       1   2   3
##   0 437  27   6
##   1 911  14   1
mod=glm(SM ~ sickle + study, family='binomial',
        data.frame(SM=dat_all$SM[ind],
                   sickle=as.numeric(dat_all$sickle==3)[ind],
                   study = as.factor(dat_all$study)[ind]))
xx=summary(mod)$coefficients
xx
##                         Estimate Std. Error   z value     Pr(>|z|)
## (Intercept)           -0.4450513 0.09462094 -4.703518 2.557161e-06
## sickle                -2.5409675 0.44759858 -5.676889 1.371664e-08
## studyKampala (Uganda)  1.4288183 0.14000062 10.205800 1.867760e-24
## studyKilifi (Kenya)    1.1797031 0.11100124 10.627837 2.211831e-26
round(exp(xx['sickle','Estimate']+(c(-1,0,1)*1.96*xx['sickle','Std. Error'])),2)
## [1] 0.03 0.08 0.19
for(ss in African_studies){
  ind = dat_all$study==ss & dat_all$sickle%in% c(1,3)
  mod=glm(SM ~ sickle, family='binomial',
          data.frame(SM=dat_all$SM[ind],
                     sickle=as.numeric(dat_all$sickle==3)[ind]))
  xx=summary(mod)$coefficients
  print(ss)
  print(round(exp(xx['sickle','Estimate']+(c(-1,0,1)*1.96*xx['sickle','Std. Error'])),2))
}
## [1] "Kilifi (Kenya)"
## [1] 0.01 0.08 0.67
## [1] "Kampala (Uganda)"
## [1] 0.03 0.08 0.24
## [1] "FEAST (Uganda)"
## [1] 0.01 0.08 0.59
# Additive model
mod=glm(SM ~ sickle + study, family='binomial',
        data.frame(SM=dat_all$SM,
                   sickle=dat_all$sickle,
                   study = as.factor(dat_all$study)))
xx=summary(mod)$coefficients
xx
##                         Estimate Std. Error   z value     Pr(>|z|)
## (Intercept)            0.8716288  0.2050853  4.250080 2.136943e-05
## sickle                -1.3219401  0.1722037 -7.676607 1.633571e-14
## studyKampala (Uganda)  1.4410207  0.1378247 10.455464 1.383197e-25
## studyKilifi (Kenya)    1.1831195  0.1086905 10.885220 1.355712e-27
round(exp(xx['sickle','Estimate']+(c(-1,0,1)*1.96*xx['sickle','Std. Error'])),2)
## [1] 0.19 0.27 0.37

other clustering approaches - mclust

mclust

dat_all$log10_para = log10(dat_all$para+50)
dat_all$log10_wbc = log10(dat_all$wbc)
ind = !is.na(dat_all$log10_hrp2)&!is.na(dat_all$log10_platelet)
sum(ind)
## [1] 2622
table(dat_all$study[ind])
## 
##       Bangladesh   FEAST (Uganda) Kampala (Uganda)   Kilifi (Kenya) 
##              171              559              492             1400
X = dat_all[ind, c('log10_hrp2','log10_platelet')]
mclust_mods = Mclust(data = X, G=3)
summary(mclust_mods)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VVV (ellipsoidal, varying volume, shape, and orientation) model with 3
## components: 
## 
##  log-likelihood    n df       BIC       ICL
##        -4041.15 2622 17 -8216.118 -9016.347
## 
## Clustering table:
##    1    2    3 
##  865 1563  194
table(dat_all$study[ind],mclust_mods$classification)
##                   
##                      1   2   3
##   Bangladesh        27 144   0
##   FEAST (Uganda)   173 195 191
##   Kampala (Uganda) 171 321   0
##   Kilifi (Kenya)   494 903   3
table(dat_all$sickle[ind],mclust_mods$classification)
##    
##        1    2    3
##   1  752 1383  142
##   2   41   21   29
##   3   31    5   15
comp_cols = brewer.pal(n = mclust_mods$G, name = 'Dark2')
pairs(X, col = adjustcolor(comp_cols[mclust_mods$classification],.4),
      pch=16)

Compare with previously published probabilities

model1_probs = read.csv(file = 'Kilifi_Model1_Probabilities.csv')
model1_probs = model1_probs %>% arrange(sample_code) %>% filter(sample_code %in% dat_all$IDs)
dat_Kilifi = dat_all %>% filter(study=='Kilifi (Kenya)')%>%arrange(IDs)

par(las=1, family='serif', cex.lab=1.3, cex.axis=1.3)
plot(model1_probs$P_SM1, dat_Kilifi$P_SM,
     xlab='Prob(Severe Malaria): platelet/white count model',
     ylab='Prob(Severe Malaria): platelet/PfHRP2 model',
     panel.first=grid(), main = 'Kilifi (Kenya)')
abline(lm(dat_Kilifi$P_SM~model1_probs$P_SM1), col='red',lwd=3)

hist(model1_probs$P_SM1 - dat_Kilifi$P_SM)

cor.test(model1_probs$P_SM1, dat_Kilifi$P_SM)
## 
##  Pearson's product-moment correlation
## 
## data:  model1_probs$P_SM1 and dat_Kilifi$P_SM
## t = 36.21, df = 1398, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6676275 0.7217688
## sample estimates:
##       cor 
## 0.6956848

Save output

# dat_kemri = dat_all[dat_all$study=='Kilifi (Kenya)', ]
# save(dat_kemri, file = 'kemri_PSM.RData')

Optimal combination of thresholds

We use the model of platelet counts and HRP2, not including the FEAST study (this is more conservative in its estimated operating characteristics)

source('functions.R')
x1=seq(100,200,by=1)
x2=seq(500, 1500, by=1)
my_thresh=expand.grid(plt_thresh = x1, 
                      hrp2_thresh = x2)
my_thresh$sens = my_thresh$sp = NA
for(k in 1:nrow(my_thresh)){
  out=roc_output(thetas = thetas_plt_hrp2,
                 stan_dat = stan_dat_plt_hrp2, 
                 cutofftype = c('upper','lower'), 
                 mycuts = log10(unlist(my_thresh[k,1:2])))
  my_thresh$sens[k] = out[1]
  my_thresh$sp[k] = out[2]
}
my_thresh$FP = round(100*(1-my_thresh$sp), 2)
my_thresh$TP = round(100*my_thresh$sens, 2)

xs = unique(my_thresh$FP)
ys = array(dim = length(xs))
for(i in 1:length(xs)){
  ind = my_thresh$FP == xs[i]
  ys[i] = max(my_thresh$TP[ind])
}

my_fp = 5.2
my_tp = 79

Plot results

f=smooth.spline(x = xs, y = ys,df = 5)
par(las=1, family='serif', cex.lab=1.3, cex.axis=1.3,mfrow=c(2,2))


z_FP = dcast(data = my_thresh, formula = hrp2_thresh~plt_thresh, 
             value.var='FP')
range(z_FP[, -1])
## [1]  1.1 12.0
image(y = x1, x=x2, z=as.matrix(z_FP[, -1]),
      breaks = seq(0,15,length.out = 16),
      main='False positive rate',yaxt='n',
      col=hcl.colors(15, "YlGn", rev = TRUE),
      xlab='PfHRP2 (ng/mL)',ylab='Platelet count (x1000 per uL)')
axis(2, at = c(100,150,200))
grid()
abline(v=800, h=150,lty=2)

z_TP = dcast(data = my_thresh, formula = hrp2_thresh~plt_thresh, value.var='TP')
range(z_TP[, -1])
## [1] 54.5 90.3
image(y = x1, x=x2, z=as.matrix(z_TP[, -1]),
      breaks = seq(50,91,length.out = 16),panel.first=grid(),
      main='True positive rate',yaxt='n',
      col=hcl.colors(15, "Blue-Red 3", rev = TRUE),
      xlab='PfHRP2 (ng/mL)',ylab='Platelet count (x1000 per uL)')
axis(2, at = c(100,150,200))
grid()
abline(v=800, h=150,lty=2)

plot(f$x, f$y, panel.first=grid(),
     xlab = 'False positive rate (%)', 
     ylab = 'True positive rate (%)',
     xlim = c(0,15), ylim = c(50, 100), type='l',lwd=2)
abline(v=5, h=80,lty=2)
ind = my_thresh$FP < my_fp & my_thresh$TP > my_tp

writeLines(sprintf('There are %s combinations that have a true positive rate greater than %s and a false positive rate less than %s', 
                   sum(ind), my_tp, my_fp))
## There are 62 combinations that have a true positive rate greater than 79 and a false positive rate less than 5.2
print(my_thresh[ind, ])
##       plt_thresh hrp2_thresh        sp      sens  FP   TP
## 22871        144         726 0.9485039 0.7905506 5.1 79.1
## 24185        145         739 0.9485109 0.7908446 5.1 79.1
## 24286        145         740 0.9485663 0.7906988 5.1 79.1
## 24387        145         741 0.9486216 0.7905530 5.1 79.1
## 25499        146         752 0.9485189 0.7910760 5.1 79.1
## 25600        146         753 0.9485737 0.7909287 5.1 79.1
## 25701        146         754 0.9486283 0.7907813 5.1 79.1
## 25802        146         755 0.9486829 0.7906338 5.1 79.1
## 26813        147         765 0.9485279 0.7912465 5.1 79.1
## 26914        147         766 0.9485820 0.7910977 5.1 79.1
## 27015        147         767 0.9486361 0.7909487 5.1 79.1
## 27116        147         768 0.9486900 0.7907997 5.1 79.1
## 27217        147         769 0.9487438 0.7906506 5.1 79.1
## 27318        147         770 0.9487976 0.7905014 5.1 79.1
## 28127        148         778 0.9485379 0.7913574 5.1 79.1
## 28228        148         779 0.9485915 0.7912071 5.1 79.1
## 28329        148         780 0.9486449 0.7910567 5.1 79.1
## 28430        148         781 0.9486982 0.7909063 5.1 79.1
## 28531        148         782 0.9487514 0.7907557 5.1 79.1
## 28632        148         783 0.9488046 0.7906051 5.1 79.1
## 29441        149         791 0.9485491 0.7914102 5.1 79.1
## 29542        149         792 0.9486020 0.7912585 5.1 79.1
## 29643        149         793 0.9486549 0.7911067 5.1 79.1
## 29744        149         794 0.9487076 0.7909549 5.1 79.1
## 29845        149         795 0.9487602 0.7908029 5.1 79.1
## 29946        149         796 0.9488128 0.7906509 5.1 79.1
## 30654        150         803 0.9485090 0.7915594 5.1 79.2
## 30755        150         804 0.9485614 0.7914064 5.1 79.1
## 30856        150         805 0.9486137 0.7912533 5.1 79.1
## 30957        150         806 0.9486660 0.7911002 5.1 79.1
## 31058        150         807 0.9487182 0.7909470 5.1 79.1
## 31159        150         808 0.9487702 0.7907937 5.1 79.1
## 31260        150         809 0.9488222 0.7906403 5.1 79.1
## 31968        151         816 0.9485230 0.7915016 5.1 79.2
## 32069        151         817 0.9485749 0.7913473 5.1 79.1
## 32170        151         818 0.9486267 0.7911929 5.1 79.1
## 32271        151         819 0.9486784 0.7910384 5.1 79.1
## 32372        151         820 0.9487299 0.7908839 5.1 79.1
## 32473        151         821 0.9487814 0.7907293 5.1 79.1
## 32574        151         822 0.9488328 0.7905747 5.1 79.1
## 33282        152         829 0.9485383 0.7913898 5.1 79.1
## 33383        152         830 0.9485896 0.7912342 5.1 79.1
## 33484        152         831 0.9486408 0.7910786 5.1 79.1
## 33585        152         832 0.9486920 0.7909229 5.1 79.1
## 33686        152         833 0.9487430 0.7907671 5.1 79.1
## 33787        152         834 0.9487939 0.7906112 5.1 79.1
## 34495        153         841 0.9485040 0.7913821 5.1 79.1
## 34596        153         842 0.9485549 0.7912254 5.1 79.1
## 34697        153         843 0.9486056 0.7910686 5.1 79.1
## 34798        153         844 0.9486563 0.7909117 5.1 79.1
## 34899        153         845 0.9487069 0.7907548 5.1 79.1
## 35000        153         846 0.9487573 0.7905978 5.1 79.1
## 35809        154         854 0.9485223 0.7911675 5.1 79.1
## 35910        154         855 0.9485727 0.7910096 5.1 79.1
## 36011        154         856 0.9486229 0.7908517 5.1 79.1
## 36112        154         857 0.9486730 0.7906937 5.1 79.1
## 36213        154         858 0.9487230 0.7905356 5.1 79.1
## 37123        155         867 0.9485420 0.7909029 5.1 79.1
## 37224        155         868 0.9485917 0.7907439 5.1 79.1
## 37325        155         869 0.9486414 0.7905848 5.1 79.1
## 38336        156         879 0.9485135 0.7907496 5.1 79.1
## 38437        156         880 0.9485629 0.7905895 5.1 79.1
## Legend plot
plot(NA,NA,xlim=0:1,ylim=0:1,yaxt='n',xaxt='n',ylab='',xlab='',bty='n')

lgd_ = rep(NA, 15)
lgd_[c(1,8,15)] = c(0,8,15)
legend('left',title = 'False positive rate\n',
       legend = lgd_,
       fill = hcl.colors(15, "YlGn", rev = TRUE),
       bty='n',x.intersp =.5,
       border = NA,inset = 0.01,
       y.intersp = 0.5,
       cex = 1, text.font = 1)

lgd_ = rep(NA, 15)
lgd_[c(1,8,15)] = c(90,70,50)
legend('right',title = 'True positive rate\n',
       legend = lgd_,
       fill = hcl.colors(15, "Blue-Red 3", rev = F),
       bty='n',x.intersp =.5,
       border = NA,inset = 0.01,
       y.intersp = 0.5,
       cex = 1, text.font = 1)